
pacman::p_load(tidyverse,
               fst,
               tidymodels,
               doParallel,
               embed,
               tictoc,
               lubridate,
               glmnet,
               glue,
               glmnetUtils
)

# Folder that holds the saved smbinning results; the binning loop further down
# looks up one "<variable>.rds" file per numeric predictor in it.
path_smbinning <- paste0("../../data/pipeline_outputs/", 
                         SPECIAL_SUFFIX, "/", "smbinning_", TRAINING_SUFFIX,
                         "_", RAND_NO_CID_SMALLEST_LARGEST, "/")


# Predictor names listed in the lasso-selection output
# (column `Variable` of df_lasso_vars_and_dict.csv).
v_lasso_vars <- read_csv(paste0("../../data/pipeline_outputs/", SPECIAL_SUFFIX, 
                                "/lasso_selected_vars_", TRAINING_SUFFIX, "_",
                                RAND_NO_CID_SMALLEST_LARGEST, "/",
                                "df_lasso_vars_and_dict.csv")) %>% 
  pull(Variable)

# Read only the columns needed for modelling, keep the requested consumer
# groups, then drop the grouping column.
#
# `columns` takes a plain character vector. `all_of()` is a tidyselect helper
# meant for use inside selecting functions (select(), across(), ...), so the
# variable names are passed directly. unique() guards against a selected
# variable repeating one of the key columns.
df_training_raw <- read_fst(paste0("../../data/pipeline_outputs/", 
                                   SPECIAL_SUFFIX, "/", "train_",
                                   TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "/",
                                   "training.fst"),
                            columns = unique(c("cid", "qtr", "t_default", "rand_no_cid",
                                               v_lasso_vars, "consumer_category"))) %>% 
  filter(consumer_category %in% CONSUMER_GROUP) %>% 
  select(-consumer_category)

# Add helper columns to the raw training data:
#   year         - calendar year of `qtr`
#   year_outcome - 10 * year + the (still numeric) outcome
#   t_default    - outcome recoded as a factor with levels 0 and 1
# `year_outcome` must be computed before `t_default` becomes a factor.
df_training_outcome <- mutate(
  df_training_raw,
  year = year(qtr),
  year_outcome = (10 * year) + t_default,
  t_default = factor(t_default, levels = c(0, 1))
)

# Predictors stored as character are treated as categorical.
# (Factor-typed predictors are not picked up by this test.)
v_categorical <- names(
  select(
    select(df_training_outcome, contains("VARNAMEPART")),
    where(is.character)
  )
)

# Every remaining predictor column.
df_numeric <- select(
  df_training_outcome,
  contains("VARNAMEPART"),
  -all_of(v_categorical)
)

# Predictors with fewer than 5 distinct values (NA counts as a value).
# NOTE(review): the long reshape stacks every predictor into one column, which
# can be memory-hungry on a large training set.
v_var_less_than_5_unique <- df_numeric %>%
  pivot_longer(cols = everything()) %>%
  group_by(name) %>%
  summarize(n_unique = n_distinct(value)) %>%
  filter(n_unique < 5) %>%
  pull(name)

v_var_5_or_more_unique <- setdiff(colnames(df_numeric), v_var_less_than_5_unique)

# Sanity check: concatenation and union have the same length only when the
# two name sets do not overlap.
v_concat_dummies <- c(v_categorical, v_var_less_than_5_unique)
v_union_dummies <- union(v_var_less_than_5_unique, v_categorical)

stopifnot(
  "There should not be overlap between categorical and numeric vars." =
    length(v_concat_dummies) == length(v_union_dummies)
)

v_vars_to_dummy <- v_union_dummies
v_vars_to_discretize_and_dummy <- v_var_5_or_more_unique

# Persist the list of low-cardinality predictors.
df_v_var_less_than_5_unique <- tibble("Variable" = v_var_less_than_5_unique)

write_csv(
  df_v_var_less_than_5_unique,
  paste0("../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "train_binned_",
         TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "/",
         "vars_less_than_5_unique.csv")
)

# Predictors that are neither categorical nor low-cardinality get binned.
v_var_to_bin <- setdiff(
  names(select(df_training_outcome, contains("VARNAMEPART"))),
  c(v_categorical, v_var_less_than_5_unique)
)

# Low-cardinality predictors are handled as factors from here on.
df_training_factor <- mutate(
  df_training_outcome,
  across(all_of(v_var_less_than_5_unique), as.factor)
)

# Predictors that are still numeric after the factor conversion ...
v_numeric_attr <- names(
  select(
    select(df_training_factor, contains("VARNAMEPART")),
    where(is.numeric)
  )
)

# ... and every other lasso-selected variable.
v_non_numeric_attr <- setdiff(v_lasso_vars, v_numeric_attr)

# Save both name lists next to the binned training outputs.
write_csv(
  tibble("Variable" = v_numeric_attr),
  paste0("../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "train_binned_",
         TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "/", "numeric_predictors.csv")
)

write_csv(
  tibble("Variable" = v_non_numeric_attr),
  paste0("../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "train_binned_",
         TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "/", "non_numeric_predictors.csv")
)


# Make missing values explicit: numeric predictors get the sentinel -999999,
# non-numeric predictors get an explicit "missing" level.
df_training <- mutate(
  df_training_factor,
  across(
    .cols = all_of(v_numeric_attr),
    .fns = function(x) ifelse(is.na(x), -999999, x)
  ),
  across(
    .cols = all_of(v_non_numeric_attr),
    .fns = function(x) fct_explicit_na(x, "missing")
  )
)




# Replace every numeric predictor in `df_training` with a binned version.
# Cut points come from the smbinning result saved for that variable; when
# smbinning reported "No significant splits" the variable is either kept as
# text (fewer than 5 distinct values) or cut at its quantiles.

# List the smbinning folder once instead of on every iteration. Base
# list.files() is used so the loop does not depend on `fs` being attached.
v_smbinning_files <- list.files(path_smbinning, full.names = TRUE)

# seq_along() keeps the loop from running on index 1 and 0 when there are no
# numeric predictors.
for (i in seq_along(v_numeric_attr)) {
  
  var_ <- v_numeric_attr[i]
  rds_name_ <- paste0(var_, ".rds")
  
  # Match on the literal file-name ending rather than a regex substring.
  file_name_smb <- v_smbinning_files[endsWith(v_smbinning_files, rds_name_)]
  
  # A variable name can be the tail of another one; prefer the exact file name.
  if (length(file_name_smb) > 1) {
    file_name_smb <- file_name_smb[basename(file_name_smb) == rds_name_]
  }
  
  if (length(file_name_smb) != 1) {
    stop(
      glue("Expected exactly one smbinning file for {var_} in {path_smbinning}, ",
           "found {length(file_name_smb)}."),
      call. = FALSE
    )
  }
  
  smbinning_result <- read_rds(file_name_smb)
  
  # `[[` returns the column as a vector for data.frames and tibbles alike.
  col_ <- df_training[[var_]]
  
  # Only compare against the message when the result is a string; this avoids
  # coercing a full smbinning object to character.
  no_splits_ <- is.character(smbinning_result) &&
    all(smbinning_result == "No significant splits")
  
  if (no_splits_) {
    
    if (n_distinct(col_) < 5) {
      df_training[[var_]] <- as.character(col_)
    } else {
      # unique() collapses repeated quantiles so cut() gets distinct breaks.
      cuts_ <- unique(c(-Inf, quantile(col_), Inf))
      df_training[[var_]] <- cut(col_, cuts_, right = TRUE, include.lowest = TRUE)
    }
    
  } else {
    
    if (!is.list(smbinning_result)) {
      stop(
        glue("Unexpected smbinning result for {var_}: ",
             "{paste(format(smbinning_result), collapse = ' ')}"),
        call. = FALSE
      )
    }
    
    cuts_ <- c(-Inf, smbinning_result$cuts, Inf)
    df_training[[var_]] <- cut(col_, cuts_, right = TRUE, include.lowest = TRUE)
    
  }
  
  # Progress indicator.
  print(glue("{i}/{length(v_numeric_attr)}"))
  
}





# Formula with t_default as the response and the id columns plus every
# predictor as terms; the id columns are given the "id" role below.
discretization_formula <- reformulate(
  termlabels = names(
    select(df_training, cid, qtr, year, year_outcome, rand_no_cid,
           contains("VARNAMEPART"))
  ),
  response = "t_default"
)

set.seed(1234)

tic(msg = "Discretizing Variables")

# Build the recipe one step at a time, then train it on the training data.
recipe_binning <- recipe(formula = discretization_formula, data = df_training)

recipe_binning <- update_role(
  recipe_binning,
  cid, qtr, rand_no_cid, year, year_outcome,
  new_role = "id"
)

# (A step_discretize_xgb() step on v_var_to_bin used to sit here; disabled.)

# Reserve a level for values not seen in training.
recipe_binning <- step_novel(
  recipe_binning,
  all_of(c(v_var_to_bin, v_var_less_than_5_unique, v_categorical)),
  - all_outcomes()
)

# Dummy-encode all predictors (reference-level coding, not one-hot).
recipe_binning <- step_dummy(
  recipe_binning,
  all_of(c(v_var_to_bin, v_var_less_than_5_unique, v_categorical)),
  one_hot = FALSE
)

recipe_binning <- step_intercept(recipe_binning)

recipe_binning <- prep(recipe_binning, strings_as_factors = FALSE)

toc(log = TRUE)

# Store the trained recipe with the final model outputs.
saveRDS(
  recipe_binning,
  paste0("../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "final_model_logistic_",
         TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "_", MODEL_METRIC, "/",
         "discretization_recipe.rds")
)

# df_training_bins <- tidy(recipe_binning, id = "xgb_bins")
# 
# write_csv(df_training_bins, paste0("../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "train_binned_", 
#                                      TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "/",
#                                      "bins.csv"))

# Apply the trained recipe to the training data.
df_training_binned <- bake(recipe_binning, new_data = df_training)

# Model terms: every baked column whose name contains "VARNAMEPART".
v_attr <- str_subset(names(df_training_binned), "VARNAMEPART")

f_ <- as.formula(
  paste0("t_default ~ ", paste(v_attr, collapse = " + "))
)

set.seed(1234)

# 5-fold cross-validated binomial glmnet with alpha = 0 (ridge penalty).
tic(msg = "Fitting Logistic")
fit_logistic <- cv.glmnet(
  f_,
  data = df_training_binned,
  family = "binomial",
  alpha = 0,
  nfolds = 5,
  keep = TRUE
)
toc(log = TRUE)

# Fitted probabilities on the training data at the lambda that minimises the
# cross-validated error.
s_ <- fit_logistic$lambda.min
v_fitted <- predict(
  fit_logistic,
  df_training_binned,
  type = "response",
  s = s_
)[, 1]

# Record the chosen hyperparameters.
best_model <- tibble(alpha = 0, lambda = s_)

write_csv(
  best_model,
  paste0("../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "final_model_logistic_",
         TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "_", MODEL_METRIC, "/",
         "optimal_hyperparameters.csv")
)



# Save the cross-validated fit itself.
saveRDS(
  fit_logistic,
  paste0("../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "final_model_logistic_",
         TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "_", MODEL_METRIC, "/",
         "final_model.rds")
)

# Pair each training row with its fitted value and a "<qtr>_<rand_no_cid>" key
# (dashes removed from qtr) that is used to split the output into files.
df_fitted <- df_training %>%
  mutate(
    qtr_rand_no_cid = paste0(str_replace_all(qtr, "-", ""), "_", rand_no_cid),
    v_fitted = v_fitted
  ) %>%
  select(cid, qtr, qtr_rand_no_cid, v_fitted)

# One entry per output file.
v_unique_qtr_rand_no_cid <- unique(df_fitted$qtr_rand_no_cid)

# Write the fitted values belonging to one qtr/rand_no_cid key to its own
# .fst file.
#
# df_fitted_values: data frame with columns cid, qtr, qtr_rand_no_cid, v_fitted.
# qtr_rand_no_cids: the key whose rows are written; it is also used in the
#                   output file name ("fitted_<key>.fst").
#
# The output folder is built from the pipeline globals SPECIAL_SUFFIX,
# TRAINING_SUFFIX, RAND_NO_CID_SMALLEST_LARGEST and MODEL_METRIC.
Save_Fitted_Values <- function(df_fitted_values, qtr_rand_no_cids) {
  
  df_subset <- filter(df_fitted_values, qtr_rand_no_cid == qtr_rand_no_cids)
  df_subset <- select(df_subset, cid, qtr, v_fitted)
  
  out_path <- paste0("../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "fitted_training_logistic_",
                     TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "_", MODEL_METRIC,
                     "/" , "fitted_", qtr_rand_no_cids, ".fst")
  
  write_fst(df_subset, out_path)
  
}

# Write one fitted-values file per qtr/rand_no_cid key, and time it.
tic(msg = "Saving Logistic Fitted Values")
walk(
  v_unique_qtr_rand_no_cid,
  function(key_) Save_Fitted_Values(df_fitted, key_)
)
toc(log = TRUE)

# Turn the tictoc log lines ("<process>: <time>") into a two-column table:
# text before the last colon is the process, text after ": " is the time.
df_timing_log <- tic.log() %>%
  unlist() %>%
  {
    tibble(
      Process = str_extract(., pattern = ".+(?=:)"),
      Time = str_extract(., pattern = "(?<=: ).+")
    )
  }

# Reset the timers and the log.
tic.clear()
tic.clearlog()

write_csv(
  df_timing_log,
  paste0("../../data/pipeline_outputs/",
         SPECIAL_SUFFIX, "/", "timing_logistic_",
         TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "_", MODEL_METRIC,
         "/", "timing_log.csv")
)
